plot_ly() and ggplotly(). plot_geo(). We will work with two Starbucks datasets: one on the store locations (global) and one with the nutritional data for their food and drink items. We will also do some text analysis of the menu items.
Upload an HTML file to Quercus and make sure the figures remain interactive.
# Starbucks store locations, read straight from the course repository.
sb_locs <- read_csv(
  file = "https://raw.githubusercontent.com/JSC370/JSC370-2025/refs/heads/main/data/starbucks/starbucks-locations.csv"
)
## Rows: 25600 Columns: 12
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (10): Store Number, Store Name, Ownership Type, Street Address, City, St...
## dbl (2): Longitude, Latitude
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Nutrition facts for Starbucks menu items, read from the course repository.
sb_nutr <- read_csv(
  file = "https://raw.githubusercontent.com/JSC370/JSC370-2025/refs/heads/main/data/starbucks/starbucks-menu-nutrition.csv"
)
## Rows: 205 Columns: 7
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): Item, Category
## dbl (5): Calories, Fat (g), Carb. (g), Fiber (g), Protein (g)
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Population by US state (keyed on the full state name).
usa_pop <- read_csv(
  file = "https://raw.githubusercontent.com/JSC370/JSC370-2025/refs/heads/main/data/starbucks/us_state_pop.csv"
)
## Rows: 55 Columns: 2
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (1): state
## dbl (1): population
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Lookup table mapping full state names to their two-letter abbreviations.
usa_states <- read_csv(
  file = "https://raw.githubusercontent.com/JSC370/JSC370-2025/refs/heads/main/data/starbucks/states.csv"
)
## Rows: 51 Columns: 2
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): State, Abbreviation
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Preview the first six store-location rows.
head(sb_locs, n = 6L)
## # A tibble: 6 × 12
## `Store Number` `Store Name` `Ownership Type` `Street Address` City
## <chr> <chr> <chr> <chr> <chr>
## 1 47370-257954 Meritxell, 96 Licensed Av. Meritxell, … Ando…
## 2 22331-212325 Ajman Drive Thru Licensed 1 Street 69, Al… Ajman
## 3 47089-256771 Dana Mall Licensed Sheikh Khalifa … Ajman
## 4 22126-218024 Twofour 54 Licensed Al Salam Street Abu …
## 5 17127-178586 Al Ain Tower Licensed Khaldiya Area, … Abu …
## 6 17688-182164 Dalma Mall, Ground Flo… Licensed Dalma Mall, Mus… Abu …
## # ℹ 7 more variables: `State/Province` <chr>, Country <chr>, Postcode <chr>,
## # `Phone Number` <chr>, Timezone <chr>, Longitude <dbl>, Latitude <dbl>
# Preview the first six menu-nutrition rows.
head(sb_nutr, n = 6L)
## # A tibble: 6 × 7
## Item Category Calories `Fat (g)` `Carb. (g)` `Fiber (g)` `Protein (g)`
## <chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 Chonga Bagel Food 300 5 50 3 12
## 2 8-Grain Roll Food 380 6 70 7 10
## 3 Almond Croi… Food 410 22 45 3 10
## 4 Apple Fritt… Food 460 23 56 2 7
## 5 Banana Nut … Food 420 22 52 2 6
## 6 Blueberry M… Food 380 16 53 1 6
# Preview the first six state-population rows.
head(usa_pop, n = 6L)
## # A tibble: 6 × 2
## state population
## <chr> <dbl>
## 1 Alabama 4779736
## 2 Alaska 710231
## 3 Arizona 6392017
## 4 Arkansas 2915918
## 5 California 37253956
## 6 Colorado 5029196
# Preview the first six rows of the state-name/abbreviation lookup.
head(usa_states, n = 6L)
## # A tibble: 6 × 2
## State Abbreviation
## <chr> <chr>
## 1 Alabama AL
## 2 Alaska AK
## 3 Arizona AZ
## 4 Arkansas AR
## 5 California CA
## 6 Colorado CO
# Keep only the US stores.
sb_usa <- filter(sb_locs, Country == "US")

# Stores per state; `State/Province` is renamed to `state` while counting.
sb_locs_state <- sb_usa |>
  count(state = `State/Province`, name = "n_stores")

# Attach abbreviations to the population table so it can be matched against
# the store counts, whose `state` column is joined on `Abbreviation` below.
usa_pop_abbr <- usa_pop |>
  full_join(usa_states, by = join_by(state == State))

# Full join keeps states present on either side. The full state name coming
# from `usa_pop_abbr` ends up in `state.y` because `state` is already taken.
sb_locs_state <- sb_locs_state |>
  full_join(usa_pop_abbr, by = join_by(state == Abbreviation))
ggplotly for EDA
Answer the following questions:
Is the number of Starbucks stores proportional to the population of a state? (scatterplot)
Is the caloric distribution of Starbucks menu items different for drinks and food? (histogram)
What are the top 20 words in Starbucks menu items? (bar plot)
4a) Answer:
# Stores vs. population, one point per state, rendered interactively.
p1 <- sb_locs_state |>
  ggplot(mapping = aes(x = population, y = n_stores, colour = state)) +
  geom_point(alpha = 0.8) +
  theme_bw()

ggplotly(p1)
The number of Starbucks stores generally increases with a state’s population, but it’s not perfectly proportional. Some states, like California, have way more stores than expected, likely due to factors like urbanization and demand.
# Calorie distribution by menu category.
# `position = "identity"` overlays the two semi-transparent histograms so both
# start from a zero baseline; the default ("stack") piles one category on top
# of the other, which distorts the comparison between them.
# `bins = 30` is the value stat_bin() would pick anyway; stating it explicitly
# keeps the binning unchanged and avoids the "Pick better value" message.
p2 <- ggplot(sb_nutr, aes(x = Calories, fill = Category)) +
  geom_histogram(alpha = 0.5, bins = 30, position = "identity") +
  theme_bw()

ggplotly(p2)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
The caloric distribution of Starbucks menu items is different for drinks and food. Drinks tend to have lower calorie counts and peak around 150 calories, while food items generally have higher calorie counts and peak around 450 calories.
# Bar plot of the 20 most frequent words in menu item names.
# Item names are split into one word per row, tallied, and the 20 most common
# are kept. `sort = TRUE` is spelled out because `T` is an ordinary variable
# that can be reassigned.
p3 <- sb_nutr |>
  unnest_tokens(word, Item, token = "words") |>
  count(word, sort = TRUE) |>
  head(20) |>
  # Order the bars by frequency rather than alphabetically.
  ggplot(aes(fct_reorder(word, n), n)) +
  geom_col() +
  coord_flip() +
  theme_bw()

ggplotly(p3)
The top 20 most common words in Starbucks menu items include “iced,” “tazo,” “bottled,” “sandwich,” “chocolate,” “coffee,” and “tea,” with “iced” appearing the most frequently. This suggests a strong emphasis on cold drinks, coffees, chocolate, and popular food items like sandwiches and desserts.
plot_ly()
Create a scatterplot using plot_ly() representing the
relationship between calories and carbs. Color the points by category
(food or beverage). Is there a relationship, and do food or beverages
tend to have more calories?
# Carbohydrates against calories for every menu item, coloured by category.
plot_ly(
  data = sb_nutr,
  x = ~Calories,
  y = ~`Carb. (g)`,
  color = ~Category,
  type = "scatter",
  mode = "markers"
)
## Warning in RColorBrewer::brewer.pal(N, "Set2"): minimal value for n is 3, returning requested palette with 3 different levels
## Warning in RColorBrewer::brewer.pal(N, "Set2"): minimal value for n is 3, returning requested palette with 3 different levels
5a) Answer: There is a positive relationship between calories and carbs—items with more calories tend to have more carbs. Food items generally have higher calories than beverages, while drinks cluster more at lower calorie values.
Repeat this scatterplot but for the items that include the top 10
words. Color again by category, and add hoverinfo specifying the word in
the item name. Add layout information to title the chart and the axes,
and enable hovermode = "compare".
What are the top 10 words and is the plot much different than above?
# Ten most frequent words across all menu item names.
# count() tallies each token into `word_frequency` and sorts descending,
# replacing the group_by/summarise/arrange(across(..., desc)) chain.
topwords <- sb_nutr |>
  unnest_tokens(word, Item, token = "words") |>
  count(word, name = "word_frequency", sort = TRUE) |>
  head(10)

# Calories vs. carbs for items whose name contains one of the top-10 words.
# `drop = FALSE` keeps the original `Item` column so the hover text can show
# the actual item name next to the matched word (previously the label read
# "Item:" but displayed only the word). An item containing several top words
# contributes one point per matched word.
sb_nutr |>
  unnest_tokens(word, Item, token = "words", drop = FALSE) |>
  filter(word %in% topwords$word) |>
  plot_ly(
    x = ~Calories,
    y = ~`Carb. (g)`,
    type = "scatter",
    mode = "markers",
    color = ~Category,
    hoverinfo = "text",
    text = ~paste0("Item: ", Item, "<br>Word: ", word)
  ) |>
  layout(
    title = "Cal vs Carbs",
    xaxis = list(title = "Calories"),
    yaxis = list(title = "Carbs"),
    hovermode = "compare"
  )
## Warning in RColorBrewer::brewer.pal(N, "Set2"): minimal value for n is 3, returning requested palette with 3 different levels
## Warning in RColorBrewer::brewer.pal(N, "Set2"): minimal value for n is 3, returning requested palette with 3 different levels
plot_ly Boxplots
# Long-format nutrition table restricted to items containing a top-10 word:
# one row per (matched word, nutrient) pair, nutrient name in `Nutrient` and
# its amount in `Value`.
sb_nutr_long <- sb_nutr |>
  unnest_tokens(word, Item, token = "words") |>
  filter(word %in% topwords$word) |>
  pivot_longer(
    cols = c(Calories, `Fat (g)`, `Carb. (g)`, `Fiber (g)`, `Protein (g)`),
    names_to = "Nutrient",
    values_to = "Value"
  )

# One box per nutrient for each word. `boxmode = "group"` places the nutrient
# boxes side by side within each word; with plotly's default ("overlay") the
# five boxes for a word are drawn on top of one another.
plot_ly(
  data = sb_nutr_long,
  x = ~word,
  y = ~Value,
  color = ~Nutrient,
  type = "box"
) |>
  layout(
    title = "Nutrition values for the top 10 words items",
    xaxis = list(title = "Item Word"),
    yaxis = list(title = "Nutrition Value"),
    hovermode = "compare",
    boxmode = "group"
  )
# 3D scatter of calories, carbs and protein for items containing a top-10
# word, coloured by the matched word. Axis titles for 3D traces live under
# `scene`, not the top-level xaxis/yaxis.
sb_nutr |>
  unnest_tokens(word, Item, token = "words") |>
  filter(word %in% topwords$word) |>
  plot_ly(
    x = ~Calories,
    y = ~`Carb. (g)`,
    z = ~`Protein (g)`,
    color = ~word,
    type = "scatter3d",
    mode = "markers",
    marker = list(size = 5)
  ) |>
  layout(
    title = "3D Scatterplot of Calories, Carbs, and Protein",
    scene = list(
      xaxis = list(title = "Calories"),
      yaxis = list(title = "Carbohydrates (g)"),
      # Fixed typo in the axis label ("Proein" -> "Protein").
      zaxis = list(title = "Protein (g)")
    )
  )
## Warning in RColorBrewer::brewer.pal(N, "Set2"): n too large, allowed maximum for palette Set2 is 8
## Returning the palette you asked for with that many colors
## Warning in RColorBrewer::brewer.pal(N, "Set2"): n too large, allowed maximum for palette Set2 is 8
## Returning the palette you asked for with that many colors
plot_ly Map
# Set up mapping details
# Shared `geo` layout for the state maps: restrict to the USA, use the
# Albers USA projection, and draw lakes in steel blue.
set_map_details <- list(
  scope      = "usa",
  projection = list(type = "albers usa"),
  showlakes  = TRUE,
  lakecolor  = toRGB("steelblue")
)
# Intended upper limit for a shared colour scale across both maps.
# NOTE(review): not referenced by any code visible in this section.
shadeLimit <- 125

# Per-state hover label: store count, full state name, and population,
# separated by HTML line breaks.
sb_locs_state <- sb_locs_state |>
  mutate(
    hover = paste(
      "Number of Starbucks: ", n_stores, "<br>",
      "State: ", state.y, "<br>",
      "Population: ", population
    )
  )
# Choropleth of store counts; `state` holds the two-letter codes that the
# "USA-states" location mode matches on.
map1 <- sb_locs_state |>
  plot_geo(locationmode = "USA-states") |>
  add_trace(
    z = ~n_stores,
    text = ~hover,
    locations = ~state,
    color = ~n_stores,
    colors = "Purples"
  ) |>
  layout(title = "Starbucks store by state", geo = set_map_details)

map1
## Warning: Ignoring 4 observations
# Choropleth of state population, built the same way as the store-count map.
map2 <- sb_locs_state |>
  plot_geo(locationmode = "USA-states") |>
  add_trace(
    z = ~population,
    text = ~hover,
    locations = ~state,
    color = ~population,
    colors = "Purples"
  ) |>
  layout(title = "Population by state", geo = set_map_details)

map2
# Show both maps side by side. Each panel gets its own caption as a
# paper-anchored annotation positioned above it.
subplot(map1, map2) |>
  layout(
    annotations = list(
      list(
        x = 0.2, y = 1.0,
        text = "Starbucks Stores by State",
        showarrow = FALSE,
        xref = "paper", yref = "paper",
        font = list(size = 14, color = "black")
      ),
      list(
        x = 0.8, y = 1.0,
        text = "Population by State",
        showarrow = FALSE,
        xref = "paper", yref = "paper",
        font = list(size = 14, color = "black")
      )
    )
  )
## Warning: Ignoring 4 observations